In this markdown document, we will demonstrate how you could visualize the different methods for clustering and dimensionality reduction, as stored in the data frame (the output from the clustering_dimensionalityreduction_pseudotime script/markdown)
As input we will use the deposited ‘df.csv’ file, which contains the expression values of 275856 CD4 T cells, HSNE-based clustres, flowsom clusters, phenograph clusters, umap coordinates, diffusion map coordinates and pseudotime values.
library(dplyr)
library(ggplot2)
library(scales)
library(reshape2)
library(RColorBrewer)
library(ggrepel)
## FSC.A FSC.H FSC.W SSC.A SSC.H SSC.W CD95 CD8
## 1 123725.3 114537 70793.39 29916.04 29725 65957.20 0.04919554 -0.04211360
## 2 129744.8 113527 74898.12 37344.06 35011 69903.19 0.06008087 -0.07160002
## 3 134425.1 122187 72100.02 29106.00 28719 66419.13 -0.08300304 -0.05722039
## 4 129715.4 117286 72481.21 38762.57 36082 70404.73 0.02296851 -0.22416283
## 5 135594.0 118113 75235.51 36312.43 34373 69233.75 -0.06238258 0.07444189
## 6 109618.7 98183 73169.21 24938.76 24099 67819.69 0.18458755 0.09685653
## CD27 CXCR4 CCR7 LIVE.DEAD CD4 CD45RA CD3 CD49b
## 1 2.043456 -154.76010 1.617012 418.72021 1.115082 1.509235 2.494015 -0.15329744
## 2 1.899140 -55.10397 1.295678 62.95781 1.473094 1.649236 2.228497 0.13627116
## 3 2.095128 -138.39290 1.184099 773.71948 1.358547 1.497685 2.266846 0.04310955
## 4 1.978630 66.90539 1.450845 194.00330 1.453925 1.691411 2.390522 0.09496491
## 5 2.137328 88.51750 1.517236 244.99280 1.276686 1.428678 2.169388 -0.07254910
## 6 2.151697 -25.66885 1.486741 778.22522 1.317697 1.639596 2.038711 -0.06887352
## CD14.19 CD69 CD103 TIME CD95_150 CD8_150 CD27_150
## 1 85.44810 0.085233651 0.02757605 1759.5 0.17108983 -0.08468185 2.585516
## 2 98.47850 0.015498741 0.07988916 142.0 0.20849496 -0.11978610 2.438651
## 3 -336.29279 0.004213435 0.03451059 7317.7 -0.28636056 -0.09378094 2.640524
## 4 -98.21554 -0.022149587 -0.09928628 9650.9 0.08015787 -0.38231894 2.520807
## 5 -173.81610 0.115474455 0.03658240 5833.8 -0.21637227 0.11705446 2.681148
## 6 109.13980 0.026784593 -0.06232975 7206.4 0.60996115 0.18435949 2.700062
## CCR7_150 CD45RA_150 CD3_150 CD49b_150 CD4_150 CD69_150 CD103_150
## 1 4.138186 2.714379 3.960085 -0.5831565 2.831010 0.45620728 0.09253515
## 2 3.694022 2.865701 3.726255 0.1908505 3.173416 0.08565834 0.26558015
## 3 3.619215 2.701740 3.762954 0.2957661 3.110011 0.02331229 0.11572015
## 4 3.983945 2.910694 3.889792 0.4169627 3.220305 -0.12226621 -0.32822508
## 5 4.009308 2.625654 3.622586 -0.3594562 3.012274 0.60314041 0.12263663
## 6 3.731990 2.855382 3.526417 -0.6445128 3.026464 0.14768833 -0.20805925
## CSPLR_ST CSPLR_HsneDataX CSPLR_HsneDataY clusters_HSNE sampleID
## 1 6 -0.2343911 12.55609 CD4-1 CD3_W
## 2 0 -11.1102962 -21.56930 CD4-1 CD3_B
## 3 3 -0.2343911 12.55609 CD4-1 CD3_N
## 4 3 -0.2343911 12.55609 CD4-1 CD3_N
## 5 6 -0.2343911 12.55609 CD4-1 CD3_W
## 6 2 3.3519292 20.76395 CD4-1 CD3_R
## clusters_flowsom DC1 DC2 DC3 umap_1 umap_2
## 1 5 0.001300343 0.0005537734 0.0003425180 3.810218 1.3263930
## 2 5 0.001370561 0.0003254344 0.0002585017 3.800768 0.8864915
## 3 2 0.001196715 0.0005467035 0.0003490013 3.750142 1.1966473
## 4 5 0.001471539 0.0003559209 0.0002495618 4.043976 1.0863283
## 5 2 0.001181110 0.0006063208 0.0003737570 3.738265 1.2708768
## 6 5 0.001446349 0.0004158300 0.0002839620 4.020594 1.0154871
## merged_HSNE curve1 curve2 curve3 clusters_phenograph
## 1 Naive 3.547367 3.563246 3.579921 10
## 2 Naive 3.497071 3.512359 3.531119 10
## 3 Naive 3.603737 3.620266 3.639434 10
## 4 Naive 3.433274 3.448976 3.464703 10
## 5 Naive 3.610096 3.627680 3.644634 10
## 6 Naive 3.516714 3.529806 3.543224 10
Define a function for visualization of the diffusion map
viz.dm <- function(dat,dr, param.name,limits=NULL){
ColVal <- dat[,param.name]
if(is.null(limits)){
Lim <- quantile(ColVal,probs=seq(0,1,0.01))[c(2,100)]
p <- ggplot(dat, aes(x = DC1, y =DC2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=Lim, oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
} else {
p <- ggplot(dat, aes(x = DC1, y = DC2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=c(limits[1],limits[2]), oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
}
p
}
Define the parameters for which you would like generate plots
First check the order of columns
colnames(df)
## [1] "FSC.A" "FSC.H" "FSC.W"
## [4] "SSC.A" "SSC.H" "SSC.W"
## [7] "CD95" "CD8" "CD27"
## [10] "CXCR4" "CCR7" "LIVE.DEAD"
## [13] "CD4" "CD45RA" "CD3"
## [16] "CD49b" "CD14.19" "CD69"
## [19] "CD103" "TIME" "CD95_150"
## [22] "CD8_150" "CD27_150" "CCR7_150"
## [25] "CD45RA_150" "CD3_150" "CD49b_150"
## [28] "CD4_150" "CD69_150" "CD103_150"
## [31] "CSPLR_ST" "CSPLR_HsneDataX" "CSPLR_HsneDataY"
## [34] "clusters_HSNE" "sampleID" "clusters_flowsom"
## [37] "DC1" "DC2" "DC3"
## [40] "umap_1" "umap_2" "merged_HSNE"
## [43] "curve1" "curve2" "curve3"
## [46] "clusters_phenograph"
Provide the column numbers
parameters = colnames(df)[c(7:9, 11, 13:16,18,19)]
Visualize the parameters on the diffusion map
for (i in 1:length(parameters)){
p=viz.dm(dat=df, param.name = parameters[i])
print(p)
}
If you are not satisfied with the color scale, adjust the limits argument
viz.dm(dat=df,param.name='CD8', limits=c(-0.36,4.32))
viz.dm(dat=df, param.name='CD103', limits=c(-0.38, 3.16))
label_HSNE_dm <- df%>%group_by(clusters_HSNE)%>%select(DC1, DC2)%>%summarize_all(mean)
label_flowsom_dm <- df%>%group_by(clusters_flowsom)%>%select(DC1, DC2)%>%summarize_all(mean)
label_pheno_dm <- df%>%group_by(clusters_phenograph)%>%select(DC1, DC2)%>%summarize_all(mean)
ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_HSNE)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_HSNE), data=label_HSNE_dm)+guides(colour=FALSE)+ggtitle('clusters HSNE')
ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_flowsom)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_flowsom), data=label_flowsom_dm)+guides(colour=FALSE)+ggtitle('clusters flowsom')
ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_phenograph)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_phenograph), data=label_pheno_dm)+guides(colour=FALSE)+ggtitle('clusters phenograph')
Define a function for visualization of the umap
Visualize the parameters on the umap
for (i in 1:length(parameters)){
p=viz.umap(dat=df, param.name = parameters[i])
print(p)
}
If you are not satisfied with the color scale, adjust the limits argument
viz.umap(dat=df,param.name='CD8', limits=c(-0.36,4.32))
viz.umap(dat=df, param.name='CD103', limits=c(-0.38, 3.16))
label_HSNE_umap <- df%>%group_by(clusters_HSNE)%>%select(umap_1, umap_2)%>%summarize_all(mean)
label_flowsom_umap <- df%>%group_by(clusters_flowsom)%>%select(umap_1, umap_2)%>%summarize_all(mean)
label_pheno_umap <- df%>%group_by(clusters_phenograph)%>%select(umap_1, umap_2)%>%summarize_all(mean)
ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_HSNE)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_HSNE), data=label_HSNE_umap)+guides(colour=FALSE)+ggtitle('clusters HSNE')
ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_flowsom)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_flowsom), data=label_flowsom_umap)+guides(colour=FALSE)+ggtitle('clusters flowsom')
ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_phenograph)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_phenograph), data=label_pheno_umap)+guides(colour=FALSE)+ggtitle('clusters phenograph')
Fill in for which clusters you would like to calculate the frequencies (either clusters_HSNE, clusters_flowsom, or clusters_phenograph)
counts <- as.data.frame.matrix(table(df$sampleID,df$clusters_HSNE))
Calculate the percentage of a certain sample present in a certain cluster
counts_percofsample = counts/rowSums(counts)*100
counts_percofsample$total <- rowSums(counts)
counts_percofsample$sampleID <- row.names(counts_percofsample)
counts_percofsample <- melt(counts_percofsample, id.vars=c('sampleID', 'total'), variable.name='cluster', value.name='frequency')
Plot the frequencies in a boxplot (frequency = % of sample in cluster)
ggplot(counts_percofsample, aes(x=reorder(cluster, frequency, FUN=median), y=frequency))+ geom_boxplot(position='dodge2', outlier.shape=NA)+geom_jitter(aes(colour=sampleID), size=2)+xlab('cluster')
Plot the frequencies in bargraphs (frequency = % of sample in cluster)
If you would like to use other clusters, adjust ‘merged_HSNE’ to clusters_flowsom’, for instance
If there are more or less lineages identified adjust the number of curve columns
pseudotimevalues <- df%>%select(c(DC1, DC2,clusterID=merged_HSNE, curve1, curve2, curve3))
Preparation of table and color palette for visualization of lineages
#reshape table
pseudotimevalues <- melt(pseudotimevalues, id.vars=c('clusterID', 'DC1', 'DC2'), variable.name='lineage', value.name = 'pseudotime')
#rename curve to lineage
pseudotimevalues$lineage <- gsub('curve','lineage', pseudotimevalues$lineage)
#exclude cells with NA pseudotimevalues (those cells are not present in all lineages and have NA values for the lineages in which they are absent)
pseudotimevaluesexclNA <- pseudotimevalues%>%filter(pseudotime!='NA')
#generate colorpalette
colors <- colorRampPalette(rev(brewer.pal(11, 'Spectral'))[-6])(100)
Plot the lineages: cells are colored by pseudotime
ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=DC1, y=DC2))+geom_point(aes(color=pseudotime),size=0.1, alpha=0.3) +facet_wrap(~lineage)+scale_color_gradientn(colours=colors)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())
Plot the lineages: cells are colored by clusterID
ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=DC1, y=DC2))+geom_point(aes(color=clusterID),size =0.1)+facet_wrap(~lineage)+theme_bw()+ guides(colour = guide_legend(override.aes = list(size=5)))
Jitterplot per lineage: cells are colored by clusterID
ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=pseudotime, y=lineage)) + geom_jitter(aes(color=clusterID), size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ guides(colour = guide_legend(override.aes = list(size=5)))